{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AlbertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelWithLMHead, pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "out = 'bert-base-bahasa'\n",
    "os.makedirs(out, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('bert-base-bahasa/spiece.model',\n",
       " 'bert-base-bahasa/special_tokens_map.json',\n",
       " 'bert-base-bahasa/added_tokens.json')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = AlbertTokenizer('sp10m.cased.bert.model', unk_token='[UNK]',pad_token='[PAD]', do_lower_case = False)\n",
    "tokenizer.save_pretrained('bert-base-bahasa')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AlbertTokenizer.from_pretrained('./bert-base-bahasa', \n",
    "                                            unk_token='[UNK]',pad_token='[PAD]', do_lower_case = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = BertConfig(f'tiny-bert-bahasa-cased-parliament/config.json')\n",
    "config.vocab_size = 32000\n",
    "config.hidden_size = 312\n",
    "config.intermediate_size = 1200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BertConfig {\n",
       "  \"architectures\": null,\n",
       "  \"attention_probs_dropout_prob\": 0.1,\n",
       "  \"bos_token_id\": null,\n",
       "  \"do_sample\": false,\n",
       "  \"eos_token_ids\": null,\n",
       "  \"finetuning_task\": null,\n",
       "  \"hidden_act\": \"gelu\",\n",
       "  \"hidden_dropout_prob\": 0.1,\n",
       "  \"hidden_size\": 312,\n",
       "  \"id2label\": {\n",
       "    \"0\": \"LABEL_0\",\n",
       "    \"1\": \"LABEL_1\"\n",
       "  },\n",
       "  \"initializer_range\": 0.02,\n",
       "  \"intermediate_size\": 1200,\n",
       "  \"is_decoder\": false,\n",
       "  \"label2id\": {\n",
       "    \"LABEL_0\": 0,\n",
       "    \"LABEL_1\": 1\n",
       "  },\n",
       "  \"layer_norm_eps\": 1e-12,\n",
       "  \"length_penalty\": 1.0,\n",
       "  \"max_length\": 20,\n",
       "  \"max_position_embeddings\": 512,\n",
       "  \"model_type\": \"bert\",\n",
       "  \"num_attention_heads\": 12,\n",
       "  \"num_beams\": 1,\n",
       "  \"num_hidden_layers\": 12,\n",
       "  \"num_labels\": 2,\n",
       "  \"num_return_sequences\": 1,\n",
       "  \"output_attentions\": false,\n",
       "  \"output_hidden_states\": false,\n",
       "  \"output_past\": true,\n",
       "  \"pad_token_id\": null,\n",
       "  \"pruned_heads\": {},\n",
       "  \"repetition_penalty\": 1.0,\n",
       "  \"temperature\": 1.0,\n",
       "  \"top_k\": 50,\n",
       "  \"top_p\": 1.0,\n",
       "  \"torchscript\": false,\n",
       "  \"type_vocab_size\": 2,\n",
       "  \"use_bfloat16\": false,\n",
       "  \"vocab_size\": 32000\n",
       "}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = AutoModelWithLMHead.from_pretrained('./tiny-bert-bahasa-cased-combined/pytorch_model.bin', config = config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'sequence': '[CLS] makan ayam dengan menengah[SEP]',\n",
       "  'score': 0.00014604821626562625,\n",
       "  'token': 4705},\n",
       " {'sequence': '[CLS] makan ayam dengan berita[SEP]',\n",
       "  'score': 0.00014147661568131298,\n",
       "  'token': 1395},\n",
       " {'sequence': '[CLS] makan ayam dengan diakses[SEP]',\n",
       "  'score': 0.00014008581638336182,\n",
       "  'token': 22560},\n",
       " {'sequence': '[CLS] makan ayam dengan bertutur[SEP]',\n",
       "  'score': 0.00013591423339676112,\n",
       "  'token': 20570},\n",
       " {'sequence': '[CLS] makan ayam dengan percaya[SEP]',\n",
       "  'score': 0.0001338910369668156,\n",
       "  'token': 937}]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fill_mask('makan ayam dengan [MASK]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tiny-bert-bahasa-cased-combined/\n",
      "tiny-bert-bahasa-cased-combined/pytorch_model.bin\n",
      "tiny-bert-bahasa-cased-combined/log.txt\n",
      "tiny-bert-bahasa-cased-combined/config.json\n",
      "tiny-bert-bahasa-cased-combined/.ipynb_checkpoints/\n"
     ]
    }
   ],
   "source": [
    "!tar cvzf tiny-bert-01-04-2020-twitter.tar.gz tiny-bert-bahasa-cased-combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "\n",
    "bucketName = 'huseinhouse-storage'\n",
    "Key = 'tiny-bert-01-04-2020-twitter.tar.gz'\n",
    "outPutname = \"bert-bahasa/tiny-bert-01-04-2020-twitter.tar.gz\"\n",
    "\n",
    "s3 = boto3.client('s3')\n",
    "s3.upload_file(Key,bucketName,outPutname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
